In [ ]:
from IPython.display import HTML, display

# Markup for the notebook's title banner: a light-green rounded box that
# holds the project title in large, bold, underlined dark-green text.
banner_markup = '''
<div style="
background-color: #e6ffe6;
padding: 20px;
border-radius: 12px;
text-align: center;
">
<div style="
font-size: 48px;
font-weight: bold;
color: #006400;
text-decoration: underline;
">
Airbnb Price Prediction
</div>
</div>
'''

# Render the banner as rich HTML in the cell output.
display(HTML(banner_markup))
Airbnb Price Prediction
Objective¶
This project focuses on analyzing and modeling Airbnb listings to understand the key drivers of rental prices. By leveraging machine learning techniques, the goal is to help hosts and potential guests gain insights into pricing patterns and improve price transparency across different cities.
Airbnb price prediction video link: https://drive.google.com/file/d/18bpOWv6Jl81yTlQ6LQdjP5QiPd9VoH8G/view?usp=sharing¶
1. Importing Libraries¶
In [ ]:
!pip install ydata-profiling --quiet
!pip install catboost --quiet
!pip install squarify
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/62.0 kB ? eta -:--:-- ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.0/62.0 kB 3.8 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 400.1/400.1 kB 13.4 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 296.5/296.5 kB 21.0 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 679.0/679.0 kB 39.4 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 37.7/37.7 MB 41.2 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.4/105.4 kB 7.4 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.3/43.3 kB 3.1 MB/s eta 0:00:00 Building wheel for htmlmin (setup.py) ... done ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 8.4 MB/s eta 0:00:00 Collecting squarify Downloading squarify-0.4.4-py3-none-any.whl.metadata (600 bytes) Downloading squarify-0.4.4-py3-none-any.whl (4.1 kB) Installing collected packages: squarify Successfully installed squarify-0.4.4
In [ ]:
#importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import squarify
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings("ignore")
Upgrade to ydata-sdk
Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.
2. Data Import & Description¶
In [ ]:
# Read the Airbnb listings CSV into a DataFrame using pandas' Python parser
# engine. NOTE: on_bad_lines='skip' drops malformed rows silently, so the
# loaded row count can be lower than the number of lines in the file.
airbnb_df = pd.read_csv(
    "Airbnb_data.csv",
    on_bad_lines="skip",
    engine="python",
)
In [ ]:
# Preview the first five rows (five is the default for head()).
airbnb_df.head()
Out[ ]:
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | ... | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6901257 | 5.010635 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 3 | 1.0 | Real Bed | strict | True | ... | 40.696524 | -73.991617 | Beautiful brownstone 1-bedroom | Brooklyn Heights | 2 | 100.0 | https://a0.muscache.com/im/pictures/6d7cbbf7-c... | 11201 | 1.0 | 1.0 |
| 1 | 6304928 | 5.129899 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 7 | 1.0 | Real Bed | strict | True | ... | 40.766115 | -73.989040 | Superb 3BR Apt Located Near Times Square | Hell's Kitchen | 6 | 93.0 | https://a0.muscache.com/im/pictures/348a55fe-4... | 10019 | 3.0 | 3.0 |
| 2 | 7919400 | 4.976734 | Apartment | Entire home/apt | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | Real Bed | moderate | True | ... | 40.808110 | -73.943756 | The Garden Oasis | Harlem | 10 | 92.0 | https://a0.muscache.com/im/pictures/6fae5362-9... | 10027 | 1.0 | 3.0 |
| 3 | 13418779 | 6.620073 | House | Entire home/apt | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | Real Bed | flexible | True | ... | 37.772004 | -122.431619 | Beautiful Flat in the Heart of SF! | Lower Haight | 0 | NaN | https://a0.muscache.com/im/pictures/72208dad-9... | 94117 | 2.0 | 2.0 |
| 4 | 3808709 | 4.744932 | Apartment | Entire home/apt | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.0 | Real Bed | moderate | True | ... | 38.925627 | -77.034596 | Great studio in midtown DC | Columbia Heights | 4 | 40.0 | NaN | 20009 | 0.0 | 1.0 |
5 rows × 29 columns
In [ ]:
# Dimensions of the dataset, shown as (row count, column count).
row_count, column_count = airbnb_df.shape
(row_count, column_count)
Out[ ]:
(74111, 29)
In [ ]:
# Print a per-column summary of the DataFrame: column names, non-null counts
# and dtypes, followed by the overall memory usage. Useful here for spotting
# which columns contain missing values.
airbnb_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 74111 entries, 0 to 74110 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 74111 non-null int64 1 log_price 74111 non-null float64 2 property_type 74111 non-null object 3 room_type 74111 non-null object 4 amenities 74111 non-null object 5 accommodates 74111 non-null int64 6 bathrooms 73911 non-null float64 7 bed_type 74111 non-null object 8 cancellation_policy 74111 non-null object 9 cleaning_fee 74111 non-null bool 10 city 74111 non-null object 11 description 74111 non-null object 12 first_review 58247 non-null object 13 host_has_profile_pic 73923 non-null object 14 host_identity_verified 73923 non-null object 15 host_response_rate 55812 non-null object 16 host_since 73923 non-null object 17 instant_bookable 74111 non-null object 18 last_review 58284 non-null object 19 latitude 74111 non-null float64 20 longitude 74111 non-null float64 21 name 74111 non-null object 22 neighbourhood 67239 non-null object 23 number_of_reviews 74111 non-null int64 24 review_scores_rating 57389 non-null float64 25 thumbnail_url 65895 non-null object 26 zipcode 73143 non-null object 27 bedrooms 74020 non-null float64 28 beds 73980 non-null float64 dtypes: bool(1), float64(7), int64(3), object(18) memory usage: 15.9+ MB
- Profile Report of all the features
In [ ]:
# Build a ydata-profiling report over the whole DataFrame with the
# explorative option switched on.
profile = ProfileReport(
    airbnb_df,
    explorative=True,
    title="Airbnb Dataset Profile Report",
)

# Embed the rendered report inline in the notebook as an iframe.
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/29 [00:00<?, ?it/s] 3%|▎ | 1/29 [00:00<00:10, 2.59it/s] 14%|█▍ | 4/29 [00:00<00:03, 6.28it/s] 17%|█▋ | 5/29 [00:11<01:16, 3.19s/it] 41%|████▏ | 12/29 [00:32<00:51, 3.05s/it] 100%|██████████| 29/29 [00:35<00:00, 1.23s/it]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]